Topic Scores

This analysis examines Arctic Council speeches and national Arctic strategy documents by comparing the relative frequencies of words belonging to ten different topics. We defined these topics by creating dictionaries of commonly mentioned words that relate to each topic.

The method used here computes a document-level score for each topic: the fraction of the words in a document that belong to that topic's dictionary. It then averages the document-level scores for each country to create a country-level score.

Preparing the data

This section loads the libraries and the text files from three different folders. It also contains some excess code. Moving forward, we use the data frames produced by the readtext function, not the corpora, DTMs, or DFMs.

This version does not remove stop words, punctuation, etc. This should not adversely affect our results, since we use a dictionary method to calculate our scores.

library(stm)
library(igraph)
library(tidyverse)
library(tidytext)
library(readtext)
library(quanteda)
library(dplyr)
library(stringr)
library(plotly)
library(rworldmap)

# Strategy documents ----
# Folder that holds the national strategy documents.
DATA_DIR <- "C:/Users/laura/OneDrive/Desktop/Krogh-Arctic/Strategy Documents (new)"

# Read every file in the folder into one data frame; its `doc_id` and `text`
# columns are used below.
strategy <- readtext(paste0(DATA_DIR, "/*"))

# Label the rows with the document ids (these ids are also used by the corpus).
# Note: the ids vary a bit and are less tidy than those of the UN Corpus.
rownames(strategy) <- strategy[["doc_id"]]

# Build a quanteda corpus from the `text` column.
strategycorpus <- strategy %>% corpus(text_field = "text")


# Observer documents ----
# Same steps for the second folder.
DATA_DIR <- "C:/Users/laura/OneDrive/Desktop/Krogh-Arctic/Observer Documents"
observer <- readtext(paste0(DATA_DIR, "/*"))
rownames(observer) <- observer[["doc_id"]]
observercorpus <- observer %>% corpus(text_field = "text")


# Arctic Council speeches ----
# Same steps for the third folder.
DATA_DIR <- "C:/Users/laura/OneDrive/Desktop/Krogh-Arctic/Arctic Speeches"
speeches <- readtext(paste0(DATA_DIR, "/*"))
rownames(speeches) <- speeches[["doc_id"]]
speechescorpus <- speeches %>% corpus(text_field = "text")


# Combine the three corpora into a single corpus.
corpus <- strategycorpus + observercorpus + speechescorpus

# Document-feature matrix of unigrams and bigrams for the combined corpus.
# Tokens matching the punctuation, number or symbol character classes, and
# English stop words, are removed first; `padding = TRUE` leaves an empty
# placeholder where a token was removed. A handful of terms that appear in
# almost every document are dropped from the finished matrix.
dfm <- corpus %>%
  tokens() %>%
  tokens_remove(
    c("\\p{P}", "\\p{N}", "\\p{S}"),
    valuetype = "regex",
    padding = TRUE
  ) %>%
  tokens_remove(stopwords("english"), padding = TRUE) %>%
  tokens_ngrams(n = 1:2, concatenator = " ") %>%
  dfm(verbose = FALSE) %>%
  dfm_remove(
    c("also", "as well", "arctic", "council", "arctic council", "cooperation")
  )

Combining the data frames and restructuring

# Split each document into one word per row. unnest_tokens() lower-cases the
# words by default, so everything matched against `word` later on must be
# lower-case as well.
strategydf <- unnest_tokens(strategy, word, text)
observerdf <- unnest_tokens(observer, word, text)
speechesdf <- unnest_tokens(speeches, word, text)

# Stack the three token tables. Each row is a single word occurrence, so there
# is no key to join on: the tables are appended with bind_rows(). A full_join()
# on (doc_id, word) gives the same rows only while no file name occurs in more
# than one folder; a shared name would multiply that document's rows.
totaldf <- bind_rows(strategydf, observerdf, speechesdf)

# Attach the document length (number of words in the document) to every row.
words <- totaldf %>%
  group_by(doc_id) %>%
  mutate(length = n()) %>%
  ungroup()

# Split the doc_id at the first "_" into the country and everything that
# follows. `extra = "merge"` keeps the whole remainder in `misc` instead of
# silently discarding the pieces after the second "_".
words <- words %>%
  mutate(doc_id2 = doc_id) %>%
  separate(doc_id2, c("country", "misc"), sep = "_", extra = "merge")

Defining the topic dictionaries

There are ten topics: 1. environment 2. indigenous 3. transport 4. development 5. tourism 6. resources 7. fisheries 8. diplomacy 9. security 10. russia

# Topic dictionaries: one data frame per topic with a single character column
# `topic` holding the dictionary terms.
#
# The terms are matched against the single, lower-cased words produced by
# unnest_tokens(), therefore:
#   * every term is written in lower case ("research", "state", "unclos");
#   * alternatives that were written as one slash-separated string
#     ("science/scientific/scientist") are listed as separate terms;
#   * `stringsAsFactors = FALSE` keeps the column as character, so joining on
#     it does not trigger a factor/character coercion warning.
# NOTE(review): multi-word terms ("climate change", "sea level",
# "northern sea route", ...) are kept as written but cannot match while the
# text is tokenized into single words; "well-being" and "continentalshelf"
# probably do not match either -- confirm against the tokenizer output.

environment <- data.frame(
  topic = c(
    "research", "science", "scientific", "scientist", "environment",
    "climate", "climate change", "ocean", "sea", "sea level", "atmosphere",
    "air", "ice", "warm", "melt", "knowledge", "station", "base",
    "glaciological", "geological", "biological", "ecosystem", "paleoclimate",
    "laboratory", "institution", "conservation", "preservation",
    "temperature", "data", "measurement", "study", "precipitation",
    "pollution", "cryospheric", "publication", "biodiversity", "academic",
    "glacier", "disaster", "observe", "trend", "predict", "species", "force",
    "global warming", "protect"
  ),
  stringsAsFactors = FALSE
)

indigenous <- data.frame(
  topic = c(
    "nation", "local", "indigenous", "peoples", "community", "human",
    "social", "lives", "condition", "inhabitants", "well-being", "language",
    "health", "traditional", "culture", "rural", "residents"
  ),
  stringsAsFactors = FALSE
)

transport <- data.frame(
  topic = c(
    "transportation", "shipping", "import", "export", "maritime",
    "transport", "ship", "vessel", "navigation", "route", "channel",
    "northeast passage", "northwest passage", "northern sea route", "voyage",
    "commercial", "trade", "icebreakers", "water", "transit"
  ),
  stringsAsFactors = FALSE
)

development <- data.frame(
  topic = c(
    "sustainable", "development", "economic", "globalization",
    "economic zones", "commercial", "production", "strategy", "benefit",
    "capital", "market", "enterprise", "opportunity", "business",
    "infrastructure", "fund", "industry"
  ),
  stringsAsFactors = FALSE
)

tourism <- data.frame(
  topic = c("tourism", "tourists", "rescue", "ecotourism"),
  stringsAsFactors = FALSE
)

resources <- data.frame(
  topic = c(
    "oil", "industrial", "industry", "resource", "technology", "energy",
    "gas", "carbon", "infrastructure", "build", "exploit", "mine",
    "utilization", "exploitation", "natural", "mineral", "geothermal",
    "wind", "exploration", "consumer", "pipeline", "extraction"
  ),
  stringsAsFactors = FALSE
)

fisheries <- data.frame(
  topic = c("fish", "fisheries", "fishing", "aquaculture", "goods"),
  stringsAsFactors = FALSE
)

diplomacy <- data.frame(
  topic = c(
    "strengthen", "joint", "relationship", "peace", "integration",
    "cooperation", "international", "relations", "diplomatic", "contribute",
    "parties", "stability", "equality", "participants", "connect",
    "multilateral", "bilateral", "regional", "global", "coalition",
    "collaboration", "coordination", "share", "same", "affairs", "harmony",
    "alliance", "partnership", "freedom", "political", "meet"
  ),
  stringsAsFactors = FALSE
)

security <- data.frame(
  topic = c(
    "sovereignty", "continentalshelf", "state", "nation", "secure",
    "security", "stakeholder", "governance", "rule", "unclos", "claim",
    "jurisdiction", "rights", "interests", "territory", "zone", "own",
    "influence", "spitsbergen", "role", "legal", "law", "just", "treaties",
    "treaty", "military", "defend", "defense", "position", "independent",
    "regulation"
  ),
  stringsAsFactors = FALSE
)

russia <- data.frame(
  topic = c("russia"),
  stringsAsFactors = FALSE
)

Generating scores

# Score every country on one topic dictionary.
#
# Arguments:
#   topic  data frame with a `topic` column listing the dictionary terms.
#   data   one-row-per-word table with the columns `doc_id`, `country`, `word`
#          and `length` (number of words in the document). Defaults to the
#          global `words` table, so existing calls `countwords(topic)` keep
#          working.
# Returns:
#   a table with one row per country and the columns `country` and
#   `country_score` (the mean of the scores of that country's documents).
countwords <- function(topic, data = words) {
  # as.character() guards against factor columns; the comparison is done in
  # lower case on both sides so capitalised dictionary entries still match.
  dictionary_terms <- unique(tolower(as.character(topic[["topic"]])))

  # Document score = share of the document's words found in the dictionary.
  # summarize() yields exactly one row per document, so every document carries
  # the same weight in the country mean below (a mutate() here would leave one
  # row per matched word and weight documents by their number of matches).
  # Counting with %in% instead of an inner join keeps documents without any
  # match, which correctly enter the mean with a score of 0.
  docscores <- data %>%
    group_by(doc_id) %>%
    summarize(
      country = first(country),
      score = sum(tolower(word) %in% dictionary_terms) / first(length)
    )

  # Country score = unweighted mean of the document scores.
  countryscores <- docscores %>%
    group_by(country) %>%
    summarize(country_score = mean(score))

  # Return visibly (an assignment as last expression returns invisibly).
  countryscores
}

Graphing scores

# Dot plot of the country scores for one topic, countries ordered by score.
#   scores  table with the columns `country` and `country_score`
#   title   plot title
# Returns the ggplot object.
plot_country_scores <- function(scores, title) {
  ggplot(scores, aes(x = reorder(country, country_score), y = country_score)) +
    geom_point() +
    coord_flip() +
    ggtitle(title)
}

# For each topic: calculate the scores, build the plot, then make it
# interactive with ggplotly(). The `##` lines are console output recorded from
# an earlier run.

#Environment
#calculate scores
environment_scores <- countwords(environment)
## Warning: Column `word`/`topic` joining character vector and factor,
## coercing into character vector
## Warning: package 'bindrcpp' was built under R version 3.4.4
#plot
p.environment <- plot_country_scores(environment_scores, "Environment")
#make interactive
ggplotly(p.environment)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
#Indigenous
indigenous_scores <- countwords(indigenous)
## Warning: Column `word`/`topic` joining character vector and factor,
## coercing into character vector
p.indigenous <- plot_country_scores(indigenous_scores, "Indigenous")
ggplotly(p.indigenous)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
#Transport
transport_scores <- countwords(transport)
## Warning: Column `word`/`topic` joining character vector and factor,
## coercing into character vector
# Title capitalised to match the other topic plots.
p.transport <- plot_country_scores(transport_scores, "Transport")
ggplotly(p.transport)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
#Development
development_scores <- countwords(development)
## Warning: Column `word`/`topic` joining character vector and factor,
## coercing into character vector
p.development <- plot_country_scores(development_scores, "Development")
ggplotly(p.development)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
#Tourism
tourism_scores <- countwords(tourism)
## Warning: Column `word`/`topic` joining character vector and factor,
## coercing into character vector
p.tourism <- plot_country_scores(tourism_scores, "Tourism")
ggplotly(p.tourism)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
#Resources
resources_scores <- countwords(resources)
## Warning: Column `word`/`topic` joining character vector and factor,
## coercing into character vector
p.resources <- plot_country_scores(resources_scores, "Resources")
ggplotly(p.resources)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
#Fisheries
fisheries_scores <- countwords(fisheries)
## Warning: Column `word`/`topic` joining character vector and factor,
## coercing into character vector
p.fisheries <- plot_country_scores(fisheries_scores, "Fisheries")
ggplotly(p.fisheries)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
#Diplomacy
diplomacy_scores <- countwords(diplomacy)
## Warning: Column `word`/`topic` joining character vector and factor,
## coercing into character vector
p.diplomacy <- plot_country_scores(diplomacy_scores, "Diplomacy")
ggplotly(p.diplomacy)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
#Security
security_scores <- countwords(security)
## Warning: Column `word`/`topic` joining character vector and factor,
## coercing into character vector
p.security <- plot_country_scores(security_scores, "Security")
ggplotly(p.security)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
#Russia
russia_scores <- countwords(russia)
## Warning: Column `word`/`topic` joining character vector and factor,
## coercing into character vector
p.russia <- plot_country_scores(russia_scores, "Russia")
ggplotly(p.russia)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

Mapping

# Draw a world map coloured by the country scores of one topic.
#   topic  scores table with the columns `country` (matched against the map's
#          country names) and `country_score` (the value that is plotted)
# The map title is the expression the caller passed as `topic`, e.g.
# "environment_scores" for mapscores(environment_scores).
mapscores <- function(topic) {
  # Capture the caller's argument expression as text for the title.
  name <- deparse(substitute(topic))

  # Attach the scores to the map polygons by country name.
  joined_map <- joinCountryData2Map(
    topic,
    joinCode = "NAME",
    nameJoinColumn = "country"
  )

  mapCountryData(
    joined_map,
    nameColumnToPlot = "country_score",
    colourPalette = "heat",
    addLegend = TRUE,
    borderCol = "grey",
    mapTitle = name
  )
}

# Draw one map per topic. Each scores table is passed by its variable name
# because mapscores() uses the text of its argument as the map title.
# The `##` lines are console output recorded from an earlier run: how many
# country names in the scores table were matched to the map.
# NOTE(review): topics reporting fewer than 22 matched codes presumably have
# fewer countries in their scores table -- confirm against countwords().

# Environment
mapscores(environment_scores)
## 22 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 221 codes from the map weren't represented in your data

# Indigenous
mapscores(indigenous_scores)
## 22 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 221 codes from the map weren't represented in your data

# Transport
mapscores(transport_scores)
## 21 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 222 codes from the map weren't represented in your data

# Development
mapscores(development_scores)
## 22 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 221 codes from the map weren't represented in your data

# Tourism
mapscores(tourism_scores)
## 18 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 225 codes from the map weren't represented in your data

# Resources
mapscores(resources_scores)
## 22 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 221 codes from the map weren't represented in your data

# Fisheries
mapscores(fisheries_scores)
## 18 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 225 codes from the map weren't represented in your data

# Diplomacy
mapscores(diplomacy_scores)
## 22 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 221 codes from the map weren't represented in your data

# Security
mapscores(security_scores)
## 21 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 222 codes from the map weren't represented in your data

# Russia
mapscores(russia_scores)
## 18 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 225 codes from the map weren't represented in your data